import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import style
import seaborn as sns
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
import graphviz as gr

style.use("fivethirtyeight")


# Load the collections e-mail dataset and preview its first rows.
data = pd.read_csv("./data/collections_email.csv")
data.head()


# Naive estimate: mean payments where email == 1 minus mean payments
# where email == 0, with no adjustment for any other column.
payments_emailed = data.loc[data["email"] == 1, "payments"]
payments_not_emailed = data.loc[data["email"] == 0, "payments"]
print("Difference in means:", payments_emailed.mean() - payments_not_emailed.mean())

# The same contrast expressed as an OLS fit of payments on the email flag.
# The trailing expression is the coefficient table of that fit.
model = smf.ols(formula='payments ~ email', data=data).fit()
model.summary().tables[1]

Difference in means: -0.6202804021329484


# Scatter of payments against the e-mail flag, with the fitted line from
# `model` (payments ~ email) drawn on top.

# Add a small Gaussian jitter to the binary flag so that the points at
# 0 and 1 do not all sit on top of each other.
jittered = data.assign(
    email=data["email"] + np.random.normal(0, 0.01, size=len(data["email"]))
)

# x/y are passed by keyword: recent seaborn releases only accept `data`
# positionally and raise a TypeError for positional x/y.
sns.scatterplot(x="email", y="payments", alpha=0.8, data=jittered)

# Evaluate the fitted line on the same x grid it is drawn against, so the
# plotted slope equals the estimated coefficient. Coefficients are looked
# up by label because integer keys on a label-indexed Series are deprecated.
x_grid = np.linspace(-0.2, 1.2)
plt.plot(x_grid, model.params["Intercept"] + x_grid * model.params["email"], c="C1")
plt.xlabel("Email")
plt.ylabel("Payments");


# Partialling-out in three steps.
# 1) Regress the outcome (payments) and the treatment (email) separately
#    on the controls credit_limit and risk_score.
model_payments = smf.ols('payments ~ credit_limit + risk_score', data=data).fit()
model_email = smf.ols('email ~ credit_limit + risk_score', data=data).fit()

# 2) Collect the residuals of both fits: what the controls leave unexplained.
residuals = pd.DataFrame({
    "res_payments": model_payments.resid,
    "res_email": model_email.resid,
})

# 3) Regress the outcome residuals on the treatment residuals.
model_treatment = smf.ols('res_payments ~ res_email', data=residuals).fit()


# Compare the variance of each raw column with the variance of its
# residual after regressing on the controls.
for label, values in (
    ("Payments Variance", data["payments"]),
    ("Payments Residual Variance", residuals["res_payments"]),
    ("Email Variance", data["email"]),
    ("Email Residual Variance", residuals["res_email"]),
):
    print(label, np.var(values))

# Coefficient table of the residual-on-residual regression.
model_treatment.summary().tables[1]

Payments Variance 10807.61241599994
Payments Residual Variance 5652.453558466202
Email Variance 0.24991536000001294
Email Residual Variance 0.24918421069820038


# Scatter of payment residuals against e-mail residuals, with the fitted
# line from `model_treatment` (res_payments ~ res_email) drawn on top.

# x/y are passed by keyword: recent seaborn releases only accept `data`
# positionally and raise a TypeError for positional x/y.
sns.scatterplot(x="res_email", y="res_payments", data=residuals)

# Evaluate the fitted line on the same x grid it is drawn against, so the
# plotted slope equals the estimated coefficient. Coefficients are looked
# up by label because integer keys on a label-indexed Series are deprecated.
x_grid = np.linspace(-0.7, 1)
plt.plot(
    x_grid,
    model_treatment.params["Intercept"] + x_grid * model_treatment.params["res_email"],
    c="C1",
)
plt.xlabel("Email Residuals")
plt.ylabel("Payments Residuals");


# Single multiple regression: payments on the e-mail flag plus both
# controls. The trailing expression is its coefficient table.
model_2 = smf.ols(
    formula='payments ~ email + credit_limit + risk_score',
    data=data,
).fit()
model_2.summary().tables[1]


# Causal diagram with two causes of the outcome, drawn twice in one graph:
# once with abstract letters and once with the e-mail example's variables.
g = gr.Digraph()

# Abstract version: X -> Y and T -> Y, with T highlighted.
g.edge("X", "Y")
g.edge("T", "Y")
g.node("T", color="gold")

# Concrete version. Labels: 电子邮件 = e-mail, 信用额度 = credit limit,
# 风险分 = risk score, 付款 = payments.
g.node("电子邮件", color="gold")
for cause in ("信用额度", "风险分", "电子邮件"):
    g.edge(cause, "付款")

g


# Load the hospital treatment dataset and preview its first rows.
hospital = pd.read_csv("./data/hospital_treatment.csv")
hospital.head()


# The same simple specification is fitted on three different samples.
simple_formula = 'days ~ treatment'

# All rows.
hosp_1 = smf.ols(simple_formula, data=hospital).fit()
hosp_1.summary().tables[1]


# Only rows with hospital == 0.
hospital_0 = hospital[hospital["hospital"] == 0]
hosp_2 = smf.ols(simple_formula, data=hospital_0).fit()
hosp_2.summary().tables[1]


# Only rows with hospital == 1.
hospital_1 = hospital[hospital["hospital"] == 1]
hosp_3 = smf.ols(simple_formula, data=hospital_1).fit()
hosp_3.summary().tables[1]


# All rows, adding severity as a control.
hosp_4 = smf.ols(formula='days ~ treatment + severity', data=hospital).fit()
hosp_4.summary().tables[1]


# All rows, adding both severity and the hospital indicator as controls.
hosp_5 = smf.ols(formula='days ~ treatment + severity + hospital', data=hospital).fit()
hosp_5.summary().tables[1]


# Partialling-out for the hospital data.
# 1) Regress the outcome (days) and the treatment separately on the
#    controls severity and hospital.
model_days = smf.ols('days ~ severity + hospital', data=hospital).fit()
treatment_on_controls = smf.ols('treatment ~ severity + hospital', data=hospital).fit()

# 2) Collect the residuals of both fits.
residuals = pd.DataFrame({
    "res_days": model_days.resid,
    "res_treatment": treatment_on_controls.resid,
})

# 3) Regress the outcome residuals on the treatment residuals.
#    `model_treatment` ends up bound to this final fit.
model_treatment = smf.ols('res_days ~ res_treatment', data=residuals).fit()

model_treatment.summary().tables[1]


# Variance of the raw treatment column versus the variance left after
# regressing it on the controls.
treatment_variance = np.var(hospital["treatment"])
treatment_residual_variance = np.var(residuals["res_treatment"])

print("Treatment Variance", treatment_variance)
print("Treatment Residual Variance", treatment_residual_variance)

Treatment Variance 0.234375
Treatment Residual Variance 0.05752909187211909


# Hand-computed standard error of the slope of the residual-on-residual
# regression held in `model_treatment`.

# Residual variance estimate: sum of squared residuals over n - 2, i.e.
# two estimated parameters (intercept and slope) in that regression.
n_obs = len(model_treatment.resid)
sigma_hat = sum(model_treatment.resid**2) / (n_obs - 2)

# Divide by the sum of squared deviations of the regressor from its mean.
centered_treatment = residuals["res_treatment"] - residuals["res_treatment"].mean()
var = sigma_hat / sum(centered_treatment**2)

print("SE of the Coeficient:", np.sqrt(var))

SE of the Coeficient: 3.4469737674869023


# Causal diagram for a variable that causes the treatment, drawn twice in
# one graph: once with abstract letters and once with the hospital
# example's variables.
g = gr.Digraph()

# Abstract version: X -> T -> Y, with T highlighted.
g.edge("X", "T")
g.edge("T", "Y")
g.node("T", color="gold")

# Concrete version. Labels: 治疗 = treatment, 严重程度 = severity,
# 医院 = hospital, 天数 = days.
g.node("治疗", color="gold")
for tail, head in (
    ("严重程度", "医院"),
    ("严重程度", "天数"),
    ("医院", "治疗"),
    ("治疗", "天数"),
):
    g.edge(tail, head)

g


# Baseline specification: payments on the e-mail flag plus the two
# controls credit_limit and risk_score.
email_1 = smf.ols(
    formula='payments ~ email + credit_limit + risk_score',
    data=data,
).fit()
email_1.summary().tables[1]


# Same specification with `opened` and `agreement` added as regressors.
email_2 = smf.ols(
    formula='payments ~ email + credit_limit + risk_score + opened + agreement',
    data=data,
).fit()
email_2.summary().tables[1]


# Causal diagram for the e-mail example including the intermediate
# variables. Labels: 电子邮件 = e-mail, 打开 = opened,
# 支付协议 = payment agreement, 支付 = payments, 信用额度 = credit limit,
# 风险分 = risk score.
g = gr.Digraph()

# Paths that start at the e-mail and run through the intermediate nodes.
for tail, head in (
    ("电子邮件", "支付"),
    ("电子邮件", "打开"),
    ("电子邮件", "支付协议"),
    ("打开", "支付"),
    ("打开", "支付协议"),
    ("支付协议", "支付"),
):
    g.edge(tail, head)

# Each of the two controls points at the outcome and at both
# intermediate nodes.
for control in ("信用额度", "风险分"):
    for target in ("支付", "打开", "支付协议"):
        g.edge(control, target)

g


# Histogram of a simulated spend variable: 1000 gamma-distributed draws
# (shape 5, scale 50) followed by 700 exact zeros.
positive_spend = np.random.gamma(5, 50, 1000)
zero_spend = np.zeros(700)
customer_spend = np.concatenate([positive_spend, zero_spend])

plt.hist(customer_spend, bins=20)
plt.xlabel("Customer Spend")
plt.title("Distribution Customer Spend");


# Abstract diagram with two red-highlighted nodes: X_1 sits on the path
# T -> X_1 -> Y, and X_2 receives arrows from both T and Y.
g = gr.Digraph()

# T -> X_1 -> Y, with the treatment in gold and X_1 in red.
g.edge("T", "X_1")
g.node("T", color="gold")
g.edge("X_1", "Y")
g.node("X_1", color="red")

# T -> X_2 <- Y, with X_2 in red.
g.edge("T", "X_2")
g.edge("Y", "X_2")
g.node("X_2", color="red")

g

	payments	email	opened	agreement	credit_limit	risk_score
0	740	1	1.0	0.0	2348.495260	0.666752
1	580	1	1.0	1.0	334.111969	0.207395
2	600	1	1.0	1.0	1360.660722	0.550479
3	770	0	0.0	0.0	1531.828576	0.560488
4	660	0	0.0	0.0	979.855647	0.455140

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	669.9764	2.061	325.116	0.000	665.937	674.016
email	-0.6203	2.941	-0.211	0.833	-6.387	5.146

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	2.931e-14	1.063	2.76e-14	1.000	-2.084	2.084
res_email	4.4304	2.129	2.080	0.038	0.256	8.605

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	490.8653	9.715	50.527	0.000	471.820	509.911
email	4.4304	2.130	2.080	0.038	0.255	8.606
credit_limit	0.1511	0.008	18.833	0.000	0.135	0.167
risk_score	-8.0516	38.424	-0.210	0.834	-83.379	67.276

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	33.2667	2.662	12.498	0.000	27.968	38.566
treatment	14.1533	3.367	4.204	0.000	7.451	20.856

控制混淆因素之外的考虑

良好的控制

案例1

案例2

不良控制 - 选择偏差

糟糕的 COP

关键思想

	hospital	treatment	severity	days
0	1	1	29.686618	82
1	1	1	20.050340	57
2	1	1	20.302399	49
3	0	0	10.603118	44
4	0	0	8.332793	15

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	30.4074	2.868	10.602	0.000	24.523	36.292
treatment	-11.4074	10.921	-1.045	0.306	-33.816	11.001

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	59.0000	6.747	8.745	0.000	45.442	72.558
treatment	-10.3958	6.955	-1.495	0.141	-24.371	3.580

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	11.6641	2.000	5.832	0.000	7.681	15.647
treatment	-7.5912	2.269	-3.345	0.001	-12.110	-3.073
severity	2.2741	0.154	14.793	0.000	1.968	2.580

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	11.0111	2.118	5.198	0.000	6.792	15.230
treatment	-5.0945	3.492	-1.459	0.149	-12.049	1.861
severity	2.3865	0.195	12.251	0.000	1.999	2.774
hospital	-4.1535	4.413	-0.941	0.350	-12.943	4.636